Loading Packages

# Attach every package this report uses, installing any that are missing.
#
# require() signals a missing package by returning FALSE (plus a warning), not
# by raising an error, so wrapping it in try() and testing for "try-error"
# never triggers the install. requireNamespace() is tested directly instead,
# and library() is called afterwards so a freshly installed package is
# actually attached (and a failed install stops the script right here).
required_pkgs <- c("gdata", "ggplot2", "plotly", "dplyr", "DT", "reshape")
for (pkg in required_pkgs) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
  library(pkg, character.only = TRUE)
}

Reading the Data

# Read the two CSV files and the first sheet of the data dictionary workbook
# from the ./data directory.
testData <- utils::read.csv(file = "./data/cs-test.csv")
trainData <- utils::read.csv(file = "./data/cs-training.csv")
dataDict <- gdata::read.xls(xls = "./data/Data Dictionary.xls", sheet = 1)

Data Dictionary

# Display the data dictionary as a DT table.
dataDict %>%
  DT::datatable()

Train Data Table

# Display the training data as a DT table. The table scrolls in both
# directions and the first column on the left is kept fixed.
trainData %>%
  DT::datatable(
    options = list(
      scrollX = TRUE,
      scrollY = TRUE,
      scrollCollapse = TRUE,
      autoWidth = TRUE,
      fixedColumns = list(leftColumns = 1)
    )
  )

Data Summary

Summary all Data

# Per-column summary statistics for the full training set.
trainData %>%
  summary()
##        X          SeriousDlqin2yrs  RevolvingUtilizationOfUnsecuredLines
##  Min.   :     1   Min.   :0.00000   Min.   :    0.00                    
##  1st Qu.: 37501   1st Qu.:0.00000   1st Qu.:    0.03                    
##  Median : 75000   Median :0.00000   Median :    0.15                    
##  Mean   : 75000   Mean   :0.06684   Mean   :    6.05                    
##  3rd Qu.:112500   3rd Qu.:0.00000   3rd Qu.:    0.56                    
##  Max.   :150000   Max.   :1.00000   Max.   :50708.00                    
##                                                                         
##       age        NumberOfTime30.59DaysPastDueNotWorse   DebtRatio       
##  Min.   :  0.0   Min.   : 0.000                       Min.   :     0.0  
##  1st Qu.: 41.0   1st Qu.: 0.000                       1st Qu.:     0.2  
##  Median : 52.0   Median : 0.000                       Median :     0.4  
##  Mean   : 52.3   Mean   : 0.421                       Mean   :   353.0  
##  3rd Qu.: 63.0   3rd Qu.: 0.000                       3rd Qu.:     0.9  
##  Max.   :109.0   Max.   :98.000                       Max.   :329664.0  
##                                                                         
##  MonthlyIncome     NumberOfOpenCreditLinesAndLoans NumberOfTimes90DaysLate
##  Min.   :      0   Min.   : 0.000                  Min.   : 0.000         
##  1st Qu.:   3400   1st Qu.: 5.000                  1st Qu.: 0.000         
##  Median :   5400   Median : 8.000                  Median : 0.000         
##  Mean   :   6670   Mean   : 8.453                  Mean   : 0.266         
##  3rd Qu.:   8249   3rd Qu.:11.000                  3rd Qu.: 0.000         
##  Max.   :3008750   Max.   :58.000                  Max.   :98.000         
##  NA's   :29731                                                            
##  NumberRealEstateLoansOrLines NumberOfTime60.89DaysPastDueNotWorse
##  Min.   : 0.000               Min.   : 0.0000                     
##  1st Qu.: 0.000               1st Qu.: 0.0000                     
##  Median : 1.000               Median : 0.0000                     
##  Mean   : 1.018               Mean   : 0.2404                     
##  3rd Qu.: 2.000               3rd Qu.: 0.0000                     
##  Max.   :54.000               Max.   :98.0000                     
##                                                                   
##  NumberOfDependents
##  Min.   : 0.000    
##  1st Qu.: 0.000    
##  Median : 0.000    
##  Mean   : 0.757    
##  3rd Qu.: 1.000    
##  Max.   :20.000    
##  NA's   :3924

Summary Positive Data

# Per-column summary statistics for the rows where SeriousDlqin2yrs equals 1.
trainData %>%
  dplyr::filter(SeriousDlqin2yrs == 1) %>%
  summary()
##        X          SeriousDlqin2yrs RevolvingUtilizationOfUnsecuredLines
##  Min.   :     1   Min.   :1        Min.   :   0.000                    
##  1st Qu.: 38257   1st Qu.:1        1st Qu.:   0.398                    
##  Median : 75283   Median :1        Median :   0.839                    
##  Mean   : 75454   Mean   :1        Mean   :   4.367                    
##  3rd Qu.:112962   3rd Qu.:1        3rd Qu.:   1.000                    
##  Max.   :149980   Max.   :1        Max.   :8328.000                    
##                                                                        
##       age         NumberOfTime30.59DaysPastDueNotWorse   DebtRatio       
##  Min.   : 21.00   Min.   : 0.000                       Min.   :    0.00  
##  1st Qu.: 36.00   1st Qu.: 0.000                       1st Qu.:    0.19  
##  Median : 45.00   Median : 0.000                       Median :    0.43  
##  Mean   : 45.93   Mean   : 2.388                       Mean   :  295.12  
##  3rd Qu.: 54.00   3rd Qu.: 2.000                       3rd Qu.:    0.89  
##  Max.   :101.00   Max.   :98.000                       Max.   :38793.00  
##                                                                          
##  MonthlyIncome    NumberOfOpenCreditLinesAndLoans NumberOfTimes90DaysLate
##  Min.   :     0   Min.   : 0.000                  Min.   : 0.000         
##  1st Qu.:  2963   1st Qu.: 4.000                  1st Qu.: 0.000         
##  Median :  4500   Median : 7.000                  Median : 0.000         
##  Mean   :  5631   Mean   : 7.882                  Mean   : 2.091         
##  3rd Qu.:  6800   3rd Qu.:11.000                  3rd Qu.: 1.000         
##  Max.   :250000   Max.   :57.000                  Max.   :98.000         
##  NA's   :1669                                                            
##  NumberRealEstateLoansOrLines NumberOfTime60.89DaysPastDueNotWorse
##  Min.   : 0.0000              Min.   : 0.000                      
##  1st Qu.: 0.0000              1st Qu.: 0.000                      
##  Median : 1.0000              Median : 0.000                      
##  Mean   : 0.9885              Mean   : 1.828                      
##  3rd Qu.: 2.0000              3rd Qu.: 1.000                      
##  Max.   :29.0000              Max.   :98.000                      
##                                                                   
##  NumberOfDependents
##  Min.   :0.0000    
##  1st Qu.:0.0000    
##  Median :0.0000    
##  Mean   :0.9482    
##  3rd Qu.:2.0000    
##  Max.   :8.0000    
##  NA's   :179

Summary Negative Data

# Per-column summary statistics for the rows where SeriousDlqin2yrs is not 1.
trainData %>%
  dplyr::filter(SeriousDlqin2yrs != 1) %>%
  summary()
##        X          SeriousDlqin2yrs RevolvingUtilizationOfUnsecuredLines
##  Min.   :     2   Min.   :0        Min.   :    0.00                    
##  1st Qu.: 37453   1st Qu.:0        1st Qu.:    0.03                    
##  Median : 74982   Median :0        Median :    0.13                    
##  Mean   : 74968   Mean   :0        Mean   :    6.17                    
##  3rd Qu.:112465   3rd Qu.:0        3rd Qu.:    0.49                    
##  Max.   :150000   Max.   :0        Max.   :50708.00                    
##                                                                        
##       age         NumberOfTime30.59DaysPastDueNotWorse   DebtRatio       
##  Min.   :  0.00   Min.   : 0.0000                      Min.   :     0.0  
##  1st Qu.: 42.00   1st Qu.: 0.0000                      1st Qu.:     0.2  
##  Median : 52.00   Median : 0.0000                      Median :     0.4  
##  Mean   : 52.75   Mean   : 0.2801                      Mean   :   357.2  
##  3rd Qu.: 63.00   3rd Qu.: 0.0000                      3rd Qu.:     0.9  
##  Max.   :109.00   Max.   :98.0000                      Max.   :329664.0  
##                                                                          
##  MonthlyIncome     NumberOfOpenCreditLinesAndLoans NumberOfTimes90DaysLate
##  Min.   :      0   Min.   : 0.000                  Min.   : 0.0000        
##  1st Qu.:   3461   1st Qu.: 5.000                  1st Qu.: 0.0000        
##  Median :   5466   Median : 8.000                  Median : 0.0000        
##  Mean   :   6748   Mean   : 8.494                  Mean   : 0.1352        
##  3rd Qu.:   8333   3rd Qu.:11.000                  3rd Qu.: 0.0000        
##  Max.   :3008750   Max.   :58.000                  Max.   :98.0000        
##  NA's   :28062                                                            
##  NumberRealEstateLoansOrLines NumberOfTime60.89DaysPastDueNotWorse
##  Min.   : 0.00                Min.   : 0.0000                     
##  1st Qu.: 0.00                1st Qu.: 0.0000                     
##  Median : 1.00                Median : 0.0000                     
##  Mean   : 1.02                Mean   : 0.1267                     
##  3rd Qu.: 2.00                3rd Qu.: 0.0000                     
##  Max.   :54.00                Max.   :98.0000                     
##                                                                   
##  NumberOfDependents
##  Min.   : 0.000    
##  1st Qu.: 0.000    
##  Median : 0.000    
##  Mean   : 0.743    
##  3rd Qu.: 1.000    
##  Max.   :20.000    
##  NA's   :3745

Exploratory Data Analysis

Boxplot In Scale

# Draw a random 5% sample of the training rows (no seed is set, so the sample
# differs from run to run), drop the X column and melt to long format: one row
# per (SeriousDlqin2yrs, variable, value) triple.
sampleRows <- sample(nrow(trainData), round(0.05 * nrow(trainData)))
auxData <- trainData[sampleRows, ] %>%
  dplyr::select(-X) %>%
  reshape::melt(id.vars = "SeriousDlqin2yrs")

# Scale each variable by its largest value within the sample so that all
# variables can share one axis.
# na.rm = TRUE matters: melt() keeps missing values, and without it any
# variable containing an NA gets Max = NA, which turns every ScaledVal of that
# variable into NA and silently removes it from the plot.
# A grouped mutate() yields the same Max / ScaledVal columns as joining the
# per-variable maxima back onto the data, in the original row order;
# as.data.frame() keeps the result a plain data frame.
auxData <- auxData %>%
  dplyr::group_by(variable) %>%
  dplyr::mutate(
    Max = max(value, na.rm = TRUE),
    ScaledVal = value / Max
  ) %>%
  dplyr::ungroup() %>%
  as.data.frame()



# Grouped boxplots of the scaled values: one pair of boxes per variable,
# coloured by the SeriousDlqin2yrs class.
auxData %>%
  plotly::plot_ly(
    x = ~variable,
    y = ~ScaledVal,
    color = ~as.factor(SeriousDlqin2yrs)
  ) %>%
  plotly::add_boxplot(
    type = "box",
    boxpoints = "suspectedoutliers"
  ) %>%
  plotly::layout(boxmode = "group")